# Keep things nice and tidy, all libraries go here
library(readxl)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(svglite)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(ggsci)
# Read the review spreadsheet (skipping the first sheet row) and keep only the
# rows whose `Exclude` cell is empty.
data <- read_excel("data/data_IEEE.xlsx", skip = 1) %>%
  filter(is.na(Exclude))
## New names:
## * `` -> ...35
# Years without any publication, stored as strings (for easy slicing)
years_no_publications <- as.character(c(1974:1976, 1978))
# LABELS so slicing will not become a mess
# Column names of the 15 SWEBOK area indicators, in display order.
swebok_areas_labels <- c(
  "SR", "SD", "SC", "ST", "SM", "SCM",
  "SEM", "SEP", "SEMM", "SQ", "SEPP", "SEE",
  "CF", "MF", "EF"
)
# Same SWEBOK column names as above, minus the last three ("CF", "MF", "EF").
swebok_areas_labels_no_foundation <- c(
  "SR", "SD", "SC", "ST", "SM", "SCM",
  "SEM", "SEP", "SEMM", "SQ", "SEPP", "SEE"
)
# Human-readable axis labels, positionally aligned with
# swebok_areas_labels_no_foundation (12 entries, same order).
# "Maintenance" was previously misspelled and ended up in the figure.
swebok_areas_labels_long <- c(
  "Requirements",
  "Design",
  "Construction",
  "Testing",
  "Maintenance",
  "Config. Mgmt.",
  "SE Mgmt.",
  "SE Processes",
  "SE Models&Methods",
  "Software Quality",
  "SE Prof. Practice",
  "SE Economics"
)
# Column names of the cognitive taxonomy indicators. Each high-level concept
# is followed by its sub-concepts.
cognitive_concepts_labels <- c(
  "Attention", "Selective attention", "Divided attention", "Sustained attention",
  "Memory", "Working memory", "Short-term memory", "Long-term memory",
  "Cognitive load",
  # "Cognitive control",
  "Intrinsic CL", "Extrinsic CL",
  "Perception",
  "Problem solving", "Reasoning", "Decision making",
  "Cognitive biases",
  "Knowledge", "Explicit knowledge", "Tacit knowledge",
  "Techn. tacit knowl.", "Cogn. tacit knowl."
)
# Column names of the assessment-procedure indicators: the qualitative family
# first (generic column, then specific procedures), followed by the
# quantitative family in the same layout.
measures_labels <- c(
  "Qualit. measures", "Fieldwork", "Interview", "Task-based", "Open observation",
  "Quantit. measures", "Task performance", "Physiological meas.",
  "Subjective ratings", "Behavioral meas."
)
# COLORS
# Nine-colour qualitative palette (hex RGB).
tol9qualitative <- c(
  "#332288", "#88CCEE", "#44AA99",
  "#117733", "#999933", "#DDCC77",
  "#CC6677", "#882255", "#AA4499"
)
# Nine fill colours used for the manual fill scale. The first entry has no
# alpha channel, the remaining eight are fully opaque (FF) RGBA values.
# NOTE(review): presumably the ggsci NPG palette with the first colour swapped
# out; confirm.
NPG_modified <- c(
  "#F5E144", "#4DBBD5FF", "#00A087FF",
  "#3C5488FF", "#F39B7FFF", "#8491B4FF",
  "#91D1C2FF", "#DC0000FF", "#7E6148FF"
)
# Necessary for grouping by high-level category
#
# Adds a `Concept` column that maps every `Taxonomy` label to its high-level
# cognitive concept. Labels without a mapping (and missing labels) get NA.
#
# data: a data frame with a `Taxonomy` column.
# Returns `data` with the additional (or overwritten) character column `Concept`.
add_high_level_concepts_to_data <- function(data) {
  # High-level concept -> taxonomy labels that belong to it
  concept_members <- list(
    "Attention" = c("Attention", "Selective attention", "Divided attention", "Sustained attention"),
    "Memory" = c("Memory", "Working memory", "Short-term memory", "Long-term memory"),
    "Cognitive load" = c("Cognitive control", "Cognitive load", "Extrinsic CL", "Intrinsic CL"),
    "Perception" = "Perception",
    "Reasoning" = c("Problem solving", "Reasoning", "Decision making"),
    "Cognitive biases" = "Cognitive biases",
    "Knowledge" = c("Knowledge", "Explicit knowledge", "Tacit knowledge", "Techn. tacit knowl.", "Cogn. tacit knowl.")
  )
  # Invert into a flat lookup: taxonomy label -> concept
  concept_of <- setNames(
    rep(names(concept_members), lengths(concept_members)),
    unlist(concept_members, use.names = FALSE)
  )
  data$Concept <- unname(concept_of[as.character(data$Taxonomy)])
  data
}
# Bar chart of the number of publications per year, with each bar's count
# printed inside the bar.
ggplot(data, aes(x = as.factor(Year))) +
  geom_bar() +
  ylab("Number of publications") +
  xlab("Year") +
  # after_stat(count) replaces the deprecated `..count..` notation
  geom_text(
    stat = "count", aes(label = after_stat(count)),
    vjust = 2, color = "white", size = 2.5
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# No `plot` argument, so ggsave() writes the plot created just above
ggsave("PDFs/yearly_distribution.pdf")
## Saving 7 x 5 in image
# Cleaning not needed values: a "?" in Academia / Industry is treated as
# missing. `Type` then classifies each publication by which flags are set:
# neither -> "None", only one equal to "1" -> that one, anything else -> "Both".
data <- data %>%
  mutate(
    Academia = replace(Academia, Academia == "?", NA),
    Industry = replace(Industry, Industry == "?", NA),
    Type = case_when(
      is.na(Academia) & is.na(Industry) ~ "None",
      Academia == "1" & is.na(Industry) ~ "Academia",
      Industry == "1" & is.na(Academia) ~ "Industry",
      TRUE ~ "Both"
    )
  )
# Bar chart of publications per Type, most frequent type first, with the
# count printed inside each bar.
data %>%
  mutate(Type = fct_infreq(Type, ordered = TRUE)) %>%
  ggplot(aes(x = Type)) +
  geom_bar(width = .5) +
  xlab("Type of publication") +
  ylab("Number of publications") +
  # after_stat(count) replaces the deprecated `..count..` notation
  geom_text(
    stat = "count", aes(label = after_stat(count)),
    vjust = 3, color = "white", size = 4
  ) +
  theme_bw()
# No `plot` argument, so ggsave() writes the plot created just above
ggsave("PDFs/academia_industry_distribution.pdf")
## Saving 7 x 5 in image
A publication can be in more than one category at the same time.
# Publications per SWEBoK area: sum every area indicator column (missing = 0)
# and draw the totals as bars ordered from most to least frequent.
data %>%
  select(all_of(swebok_areas_labels)) %>% # one indicator column per SWEBoK area
  mutate_all(replace_na, 0) %>%
  summarise_all(sum) %>%
  gather(key = "SWEBOKArea", value = "publications", all_of(swebok_areas_labels)) %>%
  arrange(desc(publications)) %>%
  # freeze the sorted order as factor levels so the x axis keeps it
  mutate(SWEBOKArea = fct_inorder(SWEBOKArea)) %>%
  ggplot(aes(x = SWEBOKArea, y = publications)) +
  geom_col() +
  geom_text(aes(label = publications), vjust = -0.3, color = "black", size = 4) +
  xlab("SWEBoK Area") +
  ylab("Number of publications") +
  theme_bw()
ggsave("PDFs/swebok_distribution.pdf")
## Saving 7 x 5 in image
# Area-by-area co-occurrence matrix: cell (i, j) is the cross-product of the
# indicator columns of areas i and j (missing indicators count as 0).
swebokareas <- local({
  area_flags <- data %>%
    select(all_of(swebok_areas_labels)) %>% # selecting columns corresponding to the SWEBoK Areas
    mutate_all(replace_na, 0) %>%
    as.matrix()
  crossprod(area_flags)
})
kable(swebokareas)
|  | SR | SD | SC | ST | SM | SCM | SEM | SEP | SEMM | SQ | SEPP | SEE | CF | MF | EF |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SR | 49 | 18 | 5 | 2 | 4 | 0 | 7 | 2 | 4 | 0 | 7 | 0 | 0 | 0 | 1 |
| SD | 18 | 66 | 17 | 3 | 4 | 0 | 6 | 2 | 6 | 1 | 6 | 0 | 0 | 0 | 1 |
| SC | 5 | 17 | 77 | 5 | 22 | 1 | 3 | 2 | 2 | 0 | 3 | 0 | 0 | 0 | 0 |
| ST | 2 | 3 | 5 | 12 | 4 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| SM | 4 | 4 | 22 | 4 | 46 | 1 | 2 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| SCM | 0 | 0 | 1 | 0 | 1 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| SEM | 7 | 6 | 3 | 1 | 2 | 0 | 26 | 3 | 1 | 0 | 7 | 3 | 0 | 0 | 1 |
| SEP | 2 | 2 | 2 | 0 | 1 | 1 | 3 | 10 | 0 | 0 | 2 | 1 | 0 | 0 | 0 |
| SEMM | 4 | 6 | 2 | 0 | 0 | 0 | 1 | 0 | 8 | 0 | 1 | 0 | 0 | 0 | 0 |
| SQ | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 0 | 0 | 0 | 0 | 0 |
| SEPP | 7 | 6 | 3 | 0 | 1 | 0 | 7 | 2 | 1 | 0 | 18 | 3 | 0 | 0 | 1 |
| SEE | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 0 | 0 | 3 | 5 | 0 | 0 | 0 |
| CF | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| MF | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| EF | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
# Interactive heatmap of the SWEBOK co-occurrence matrix
plot_ly(
  x = swebok_areas_labels,
  y = swebok_areas_labels,
  z = swebokareas,
  type = "heatmap"
)
# Long table with one row per (SWEBOK area, taxonomy label): `count` is the
# sum of the taxonomy indicator over the publications tagged with that area,
# `label` is the count as text with zeros blanked out.
x <- data %>%
  select(all_of(swebok_areas_labels), all_of(cognitive_concepts_labels)) %>%
  mutate_all(replace_na, 0) %>%
  # This column is not read as numeric; the rendered run warns "NAs introduced
  # by coercion" here. NOTE(review): such an NA propagates through sum() below
  # and is the likely source of the "Removed 1 rows" plot warnings; confirm.
  mutate(`Problem solving` = as.numeric(`Problem solving`)) %>%
  # all_of() makes the external character vectors unambiguous for tidyselect
  gather(key = "SWEBOK", value = pubs, all_of(swebok_areas_labels)) %>% # use SWEBOK area as factor
  filter(pubs > 0) %>% # select areas for which there are publications
  group_by(SWEBOK) %>%
  summarise_all(sum) %>% # number of publication for each area
  select(-pubs) %>% # remove pubs to reuse it later
  gather(key = "Taxonomy", value = "count", all_of(cognitive_concepts_labels)) %>% # count publications in each cognitive taxonomy area
  mutate(label = str_replace(as.character(count), "^0", "")) # add label for later
## Warning: NAs introduced by coercion
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(swebok_areas_labels)` instead of `swebok_areas_labels` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(cognitive_concepts_labels)` instead of `cognitive_concepts_labels` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# Bubble plot
# Sort by taxonomy label and freeze that order as factor levels, so the y axis
# follows the sorted labels.
x <- arrange(x, Taxonomy)
xf <- x$Taxonomy
xfu <- unique(xf)
x$Taxonomy <- factor(xf, levels = xfu)
p <- ggplot(x)
p +
  geom_point(
    aes(x = fct_infreq(SWEBOK), y = fct_rev(Taxonomy), size = count),
    shape = 21, fill = "white", alpha = 0.60
  ) +
  geom_text(aes(x = fct_infreq(SWEBOK), y = fct_rev(Taxonomy), label = label), size = 2) +
  labs(x = "SWEBOK Area", y = "Taxonomy Area") +
  # theme_bw() is a complete theme: it must come BEFORE the theme() tweaks,
  # otherwise it replaces them all (which is what happened previously).
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1.1, size = 9, colour = "black"),
    axis.text.y = element_text(size = 8, colour = "black"),
    axis.title.x = element_text(size = 10),
    axis.title.y = element_text(size = 10, colour = "black", vjust = 0.12),
    panel.grid.major = element_line(linetype = "dashed", size = 0.1, color = "black")
  )
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text).
ggsave("PDFs/swebok_taxonomy_bubble.pdf")
## Saving 7 x 5 in image
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text).
# Preparing the dataset for analysing the research methods.
# Quantitative / Qualitative are 1 when at least one measure column of the
# respective family equals 1 and 0 otherwise (including when the test is NA);
# Both is 1 only when the two flags are 1.
data <- data %>%
  mutate(
    Quantitative = if_else(
      `Quantit. measures` == 1 | `Task performance` == 1 |
        `Physiological meas.` == 1 | `Subjective ratings` == 1 |
        `Behavioral meas.` == 1,
      1, 0,
      missing = 0
    ),
    Qualitative = if_else(
      Fieldwork == 1 | Interview == 1 | `Qualit. measures` == 1 |
        `Task-based` == 1 | `Open observation` == 1,
      1, 0,
      missing = 0
    ),
    Both = if_else(Qualitative == 1 & Quantitative == 1, 1, 0)
  )
Number of publications per year according to SWEBOK areas
# Stacked bars: per year, the summed SWEBOK area indicators of the
# non-excluded publications (missing indicators count as 0).
data %>%
  filter(is.na(Exclude)) %>%
  select(Year, SR:EF) %>%
  gather("SWEBOK", "publications", 2:16) %>%
  mutate_all(replace_na, 0) %>%
  group_by(Year, SWEBOK) %>%
  summarise(total = sum(publications)) %>%
  ggplot(aes(x = as.factor(Year), fill = SWEBOK, y = total)) +
  geom_bar(stat = "sum") +
  xlab("Year") +
  ylab("Publications") +
  scale_fill_discrete(name = "SWEBOK Areas") +
  guides(size = FALSE) + # stat "sum" adds a size aesthetic; hide its legend
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))
ggsave("PDFs/years_swebok.pdf")
## Saving 7 x 5 in image
# Add a row for every year 1973-2016 that is not present yet, label each
# publication with its research method, and plot publications per year
# split by method.
# NOTE(review): rows added by complete() have NA in `Both`, so their
# research_method is NA and the filter below removes them again; confirm
# this is intended.
# NOTE(review): a publication with Qualitative == 0 and Quantitative == 0 is
# labelled "Quantitative" by the fallback branch; confirm.
data <- data %>%
  complete(Year = seq(1973, 2016)) %>%
  mutate(
    research_method = if_else(
      Both == 1,
      "Mixed",
      if_else(Qualitative == 1, "Qualitative", "Quantitative")
    )
  ) %>%
  filter(!is.na(research_method))
data %>%
  ggplot(aes(x = as.factor(Year), fill = research_method)) +
  geom_bar() +
  scale_fill_discrete(
    name = "Research method",
    labels = c("Mixed", "Qualitative", "Quantitative", "")
  ) +
  xlab("Year") +
  ylab("Publications") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))
ggsave("PDFs/years_researchmethods.pdf")
## Saving 7 x 5 in image
# Long table: summed SWEBOK area indicators per research method, then a
# horizontal stacked bar chart with areas ordered by their total.
data.swebok.researchmethod <- data %>%
  # all_of() makes the external character vectors unambiguous for tidyselect
  select(all_of(swebok_areas_labels), research_method) %>%
  mutate_all(replace_na, 0) %>%
  filter(research_method != 0) %>% # drop rows whose method was missing (now 0)
  group_by(research_method) %>%
  summarise_at(swebok_areas_labels, sum) %>%
  gather("SWEBOK", "Publications", all_of(swebok_areas_labels))
data.swebok.researchmethod %>%
  ggplot(aes(
    x = reorder(SWEBOK, Publications, sum),
    y = Publications,
    fill = research_method
  )) +
  geom_col() +
  coord_flip() +
  xlab("SWEBOK areas") +
  scale_fill_discrete(name = "Research method")
ggsave("PDFs/SWEBOK_researchmethods.pdf")
## Saving 7 x 5 in image
# Bubble chart: for every (taxonomy label, assessment procedure) pair, the
# number of publications that have a non-missing entry in both columns.
data %>%
  filter(!is.na(Identifier)) %>%
  # all_of() makes the external character vectors unambiguous for tidyselect
  select(Identifier, all_of(cognitive_concepts_labels), all_of(measures_labels)) %>%
  gather(Taxonomy, value, all_of(cognitive_concepts_labels)) %>%
  filter(!is.na(value)) %>%
  select(-value) %>%
  gather(Method, value, all_of(measures_labels)) %>%
  filter(!is.na(value)) %>%
  group_by(Taxonomy, Method) %>%
  tally(name = "Amount") %>%
  ggplot(aes(x = Method, y = Taxonomy, fill = Amount)) +
  geom_point(aes(size = Amount), alpha = 0.5) +
  # "none" is the documented value for hiding the legend ("" is not valid)
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 30, hjust = 1, size = 8)
  )
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(measures_labels)` instead of `measures_labels` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# Save the plot created above (no `plot` argument, so ggsave() uses the last plot)
ggsave("PDFs/taxonomy_methods.pdf")
## Saving 7 x 5 in image
# Heatmap: per (SWEBOK area, high-level concept), the number of publications
# flagged with both the area and a taxonomy label of that concept.
data %>%
  select(all_of(swebok_areas_labels_no_foundation), all_of(cognitive_concepts_labels)) %>%
  mutate_all(replace_na, 0) %>%
  # all_of() makes the external character vectors unambiguous for tidyselect
  gather(Taxonomy, value2, all_of(cognitive_concepts_labels)) %>%
  add_high_level_concepts_to_data() %>%
  gather(SWEBOK, value, all_of(swebok_areas_labels_no_foundation)) %>%
  count(SWEBOK, Concept, value, value2) %>%
  # only the rows where both flags are 1 contribute their count
  mutate(freq = ifelse(value == 1 & value2 == 1, n, 0)) %>%
  distinct(SWEBOK, Concept, freq) %>%
  group_by(SWEBOK, Concept) %>%
  summarize(total = sum(freq)) %>%
  ungroup() %>%
  ggplot(aes(
    fct_relevel(SWEBOK, swebok_areas_labels_no_foundation),
    fct_rev(Concept),
    fill = total
  )) +
  geom_tile() +
  scale_fill_continuous(low = "#fff9f7", high = "red") +
  xlab("SWEBOK area") +
  ylab("Concept") +
  guides(fill = guide_legend(title = "")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 8)) +
  # positional labels: same order as swebok_areas_labels_no_foundation
  scale_x_discrete(labels = swebok_areas_labels_long)
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(swebok_areas_labels_no_foundation)` instead of `swebok_areas_labels_no_foundation` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# Save the heatmap created above (no `plot` argument, so ggsave() uses the last plot)
ggsave("PDFs/taxomony_swebok_cooccurences.pdf")
## Saving 7 x 5 in image
# Heatmap: per (high-level concept, assessment procedure), the number of
# publications flagged with both.
data %>%
  # all_of() makes the external character vectors unambiguous for tidyselect
  select(all_of(cognitive_concepts_labels), all_of(measures_labels)) %>%
  mutate_all(replace_na, 0) %>%
  gather(Taxonomy, value, all_of(cognitive_concepts_labels)) %>%
  add_high_level_concepts_to_data() %>%
  gather(Method, value2, all_of(measures_labels)) %>%
  count(Concept, Method, value, value2) %>%
  mutate(freq = ifelse(value == 1 & value2 == 1, n, 0)) %>%
  # one row (tile) per cell instead of up to four overplotted tiles whose
  # drawing order decided the visible colour
  group_by(Concept, Method) %>%
  summarise(freq = sum(freq)) %>%
  ungroup() %>%
  ggplot(aes(
    # Axis order: the specific procedures of each family first, the generic
    # family column (displayed as "Others") last. measures_labels lists the
    # generic column FIRST, so it is reordered here; previously the positional
    # tick labels were shifted by one against the data.
    fct_relevel(Method, measures_labels[c(2:5, 1, 7:10, 6)]),
    fct_rev(Concept),
    fill = freq
  )) +
  geom_tile() +
  geom_vline(xintercept = 5.5, size = 0.5, color = "darkgrey") + # qualitative | quantitative
  xlab("Assessment procedure") +
  ylab("Concept") +
  guides(fill = guide_legend(title = "")) +
  # named labels are matched by level, so only the two generic columns are
  # renamed to the catch-all "Others"; all other ticks keep their own name
  scale_x_discrete(labels = c("Qualit. measures" = "Others", "Quantit. measures" = "Others")) +
  annotate(geom = "text", x = 8, y = 0.73, label = "Quantitative", size = 3, alpha = 0.4) +
  annotate(geom = "text", x = 3, y = 0.73, label = "Qualitative", size = 3, alpha = 0.4) +
  scale_fill_continuous(low = "#fff9f7", high = "darkgreen") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 8))
ggsave("PDFs/taxonomy_method_cooccurences.pdf")
## Saving 7 x 5 in image
# Stacked bars: per year, the summed taxonomy indicators (missing = 0).
data %>%
  # all_of() makes the external character vector unambiguous for tidyselect
  select(Year, all_of(cognitive_concepts_labels)) %>%
  gather("Taxonomy", "publications", all_of(cognitive_concepts_labels)) %>%
  mutate_all(replace_na, 0) %>%
  # the gathered values are text; the rendered run warns about NAs from this coercion
  mutate(publications = as.integer(publications)) %>%
  group_by(Year, Taxonomy) %>%
  summarise(total = sum(publications)) %>%
  ggplot(aes(as.factor(Year), total, fill = Taxonomy)) +
  geom_bar(stat = "sum") +
  xlab("Year") +
  ylab("Publications") +
  scale_fill_discrete(name = "Taxonomy Areas") +
  guides(size = FALSE) + # stat "sum" adds a size aesthetic; hide its legend
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))
## Warning: NAs introduced by coercion
## Warning: Removed 1 rows containing non-finite values (stat_sum).
# One row per (publication, taxonomy label) whose indicator is positive.
df.taxonomy <- data %>%
  select(Year, all_of(cognitive_concepts_labels)) %>%
  # all_of() makes the external character vector unambiguous for tidyselect
  gather("Taxonomy", "publications", all_of(cognitive_concepts_labels)) %>%
  mutate_all(replace_na, 0) %>%
  # The gathered column arrives as character and is converted back here.
  # NOTE(review): presumably because at least one taxonomy column is read as
  # text; the rendered run warns "NAs introduced by coercion". Confirm.
  mutate(publications = as.integer(publications)) %>%
  filter(publications > 0)
## Warning: NAs introduced by coercion
# need to create a separated df to hold the percentage of publications within each year
data.percentage <- df.taxonomy %>%
  group_by(Year) %>%
  count(Taxonomy) %>%
  # still grouped by Year here, so sum(n) is the per-year total
  mutate(ratio = scales::percent(n / sum(n)))
# 100% stacked bars: share of each taxonomy label per year, with the
# percentage printed in the middle of every segment.
df.taxonomy %>%
  ggplot(aes(x = as.factor(Year), fill = as.factor(Taxonomy))) +
  geom_bar(position = "fill") +
  geom_text(
    data = data.percentage,
    aes(y = n, label = ratio),
    position = position_fill(vjust = 0.5),
    colour = "white", size = 1.3
  ) +
  xlab("Year") +
  ylab("Publications %") +
  scale_fill_discrete(name = "Topic") +
  guides(size = FALSE) +
  scale_y_continuous(labels = percent) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8)) +
  theme(
    legend.key.size = unit(.2, "cm"),
    legend.key.width = unit(0.2, "cm"),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 6)
  )
# ggsave() takes plain numbers for width/height (in `units`, inches by
# default), not grid unit objects
ggsave("PDFs/taxonomy_years.pdf", width = 10, height = 6.5)
# Taxonomy rows tagged with their high-level concept
df.concepts <- df.taxonomy %>%
  add_high_level_concepts_to_data()
# Number of rows per year, leaving out the years listed in years_no_publications
df.years <- data %>%
  filter(!(Year %in% years_no_publications)) %>%
  count(Year)
# 100% stacked bars of concepts per year, overlaid with the yearly publication
# count as a line. The line is scaled to 0..1 by n / max(n); the secondary
# axis multiplies by max(n) again to show absolute counts.
ggplot() +
  geom_bar(data = df.concepts, aes(x = as.factor(Year), fill = Concept), position = "fill") +
  geom_line(data = df.years, aes(x = as.factor(Year), y = n / max(n), group = 1), size = 0.8) +
  geom_point(data = df.years, aes(x = as.factor(Year), y = n / max(n), group = 1)) +
  scale_y_continuous(
    labels = function(x) x * 100,
    name = "Publication %",
    sec.axis = sec_axis(
      ~ . * max(df.years$n),
      name = "Total publications",
      breaks = scales::breaks_extended(10)
    )
  ) +
  xlab("Year") +
  theme(
    panel.background = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.x = element_text(margin = margin(-15, 0, 0, 0, "pt")),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8, vjust = 2.4)
  ) +
  scale_fill_manual(values = NPG_modified)
# ggsave() takes plain numbers for width/height (in `units`, inches by
# default), not grid unit objects
ggsave("PDFs/concepts_years.pdf", width = 13, height = 6.5)